section .text
bits 64

; Win64 calling convention:
;	input parameters: rcx, rdx, r8, r9, [rsp+28h], [rsp+30h], ...
;	output parameter: rax
;	stack contains: [rsp] (8) return, [rsp+8h] (20h) "shadow space" to preserve registers
;	need to preserve registers: rbx, rsi, rdi, r12, r13, r14, r15, rbp
;	may destroy registers: rax, rcx, rdx, r8, r9, r10, r11

; Caller need to push parameters to the stack (if needed)
; and then allocate free space in stack of size 20h bytes (sub rsp,20h).

; Function must align stack to 16 bytes boundary (= do 1 push)
; and preserve registers rbx, rsi, rdi, r12, r13, r14, r15, rbp

; Loop-unrolling parameters used by the routines in this file:
;	LOOPBIT  = log2 of the number of entries handled by one pass of an unrolled "big loop"
;	           (used as the shift count that turns a length into a number of passes)
;	LOOPNUM  = number of entries per big-loop pass (1 << LOOPBIT = 32), the %rep count
;	LOOPMASK = mask giving the leftover entry count (len AND LOOPMASK) for the "small loop"
%assign	LOOPBIT	5
%assign LOOPNUM (1 << LOOPBIT)
%assign LOOPMASK (LOOPNUM - 1)

; =============================================================================
;                       CHECK compiler compatibility
; =============================================================================
; inputs: rcx=par1 (123), rdx=par2 (456), r8=par3 (789), r9=par4 (321), [rsp+28h]=par5 (654), [rsp+30h]=par6 (987)
; output: rax=output 1=OK, 0=not

global CheckComp_x64

; Checks that the six arguments arrive where this file expects them.
; Each argument is compared with its agreed constant; the first mismatch
; ends the check with RAX = 0, a complete match returns RAX = 1.
; Changes only RAX and the flags.

CheckComp_x64:
		xor		eax,eax				; result <- 0 (error); writing EAX also clears the upper half of RAX
		cmp		rcx,123				; par1
		jne		.done
		cmp		rdx,456				; par2
		jne		.done
		cmp		r8,789				; par3
		jne		.done
		cmp		r9,321				; par4
		jne		.done
		cmp		qword [rsp+28h],654	; par5, first argument passed on the stack
		jne		.done
		cmp		qword [rsp+30h],987	; par6, second argument passed on the stack
		jne		.done
		mov		eax,1				; every argument matched: result <- 1 (ok)
.done:
		ret

; =============================================================================
;                           FILL data string
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 val, r8=s64 len

global FillStr_x64

; Stores the 64-bit value val into len consecutive qwords starting at dst.
; Nothing is written when len <= 0. Two interchangeable variants follow.

%if 1	; 1=use "rep stosq" method, 0=use "mov" method
; The "rep stosq" variant is the faster one here, so the flag is set to 1.

; ----- fill using "rep stosq"
; RDI = dst, RAX = val, RCX = entry counter, R9 = saved RDI

FillStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to fill
		mov		r9,rdi				; RDI must be preserved: park it in volatile R9
		mov		rax,rdx				; RAX <- val (the value STOSQ stores)
		mov		rdi,rcx				; RDI <- dst (where STOSQ stores)
		mov		rcx,r8				; RCX <- number of entries
		rep		stosq				; fill the string
		mov		rdi,r9				; restore RDI
.done:
		ret

%else	; method "mov"

; ----- fill using unrolled "mov" stores
; R9 = dst cursor, RDX = val, RCX = loop counter, R8 = original len

FillStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to fill
		mov		r9,rcx				; R9 <- dst, frees RCX for counting
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		[r9+LOOPINX],rdx
%assign LOOPINX LOOPINX+8
%endrep
		add		r9,8*LOOPNUM		; advance dst by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		[r9],rdx
		add		r9,8				; advance dst by one entry
		dec		rcx
		jnz		.single

.done:
		ret

%endif

; =============================================================================
;                           COPY data string
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len

global CopyStr_x64

; Copies len qwords from src to dst in ascending address order.
; Nothing is copied when len <= 0. Two interchangeable variants follow.

%if 1	; 1=use "rep movsq" method, 0=use "mov" method
; AMD: "rep movsq" is faster, Intel: "mov" is faster.

; ----- copy using "rep movsq"
; RDI = dst, RSI = src, RCX = entry counter, R10/R11 = saved RSI/RDI

CopyStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to copy
		mov		r10,rsi				; RSI and RDI must be preserved:
		mov		r11,rdi				; park them in volatile R10/R11
		mov		rdi,rcx				; RDI <- dst
		mov		rsi,rdx				; RSI <- src
		mov		rcx,r8				; RCX <- number of entries
		rep		movsq				; copy the string
		mov		rsi,r10				; restore RSI
		mov		rdi,r11				; restore RDI
.done:
		ret

%else	; method "mov"

; ----- copy using unrolled "mov" pairs
; RAX = scratch, R9 = dst cursor, RDX = src cursor, RCX = loop counter, R8 = original len

CopyStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to copy
		mov		r9,rcx				; R9 <- dst, frees RCX for counting
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[rdx+LOOPINX]
		mov		[r9+LOOPINX],rax
%assign LOOPINX LOOPINX+8
%endrep
		add		r9,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[rdx]
		mov		[r9],rax
		add		r9,8				; advance dst by one entry
		add		rdx,8				; advance src by one entry
		dec		rcx
		jnz		.single

.done:
		ret

%endif

; =============================================================================
;                     COPY data string in DOWN direction
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len

global CopyDownStr_x64

; Copies len qwords from src to dst, starting with the last entry and moving
; towards the first one. Nothing is copied when len <= 0.

%if 0	; 1=use "rep movsq" method, 0=use "mov" method
; Here "rep movsq" is the slower variant, so the flag stays 0.

; ----- copy using "rep movsq" with the direction flag set
; RDI = dst, RSI = src, RCX = entry counter, R10/R11 = saved RSI/RDI

CopyDownStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to copy
		mov		r10,rsi				; RSI and RDI must be preserved:
		mov		r11,rdi				; park them in volatile R10/R11
		lea		rdi,[rcx+r8*8-8]	; RDI <- address of last dst entry
		lea		rsi,[rdx+r8*8-8]	; RSI <- address of last src entry
		mov		rcx,r8				; RCX <- number of entries
		std							; string operations walk DOWN
		rep		movsq				; copy the string
		cld							; back to direction UP (default)
		mov		rsi,r10				; restore RSI
		mov		rdi,r11				; restore RDI
.done:
		ret

%else	; method "mov"

; ----- copy using unrolled "mov" pairs
; RAX = scratch, R9 = dst cursor, RDX = src cursor, RCX = loop counter, R8 = original len

CopyDownStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to copy
		lea		r9,[rcx+r8*8-8]		; R9 <- address of last dst entry
		lea		rdx,[rdx+r8*8-8]	; RDX <- address of last src entry
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass, highest address first

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[rdx-LOOPINX]
		mov		[r9-LOOPINX],rax
%assign LOOPINX LOOPINX+8
%endrep
		sub		rdx,8*LOOPNUM		; move src down by one group
		sub		r9,8*LOOPNUM		; move dst down by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[rdx]
		mov		[r9],rax
		sub		rdx,8				; move src down by one entry
		sub		r9,8				; move dst down by one entry
		dec		rcx
		jnz		.single

.done:
		ret

%endif

; =============================================================================
;                   SCAN EQU data string in UP direction
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 val, r8=s64 len
; output: rax=s64 length of equal data

global ScanEquStr_x64

; Returns in RAX the number of leading entries of the string that are equal
; to val: 0 when len <= 0, len when every entry is equal.

%if 0	; 1=use "repe scasq" method, 0=use "mov" method
; In this case "repe scasq" is slower than "mov" method, use "0" flag.

; ----- scan data string using "repe scasq"
; RDI = dst, RAX = val, RCX = loop counter, R8 = old len

ScanEquStr_x64:
		test	r8,r8				; test R8 len
		jle		ScanEquStr_x64_9	; no data
		push	rdi					; push RDI
		mov		rdi,rcx				; RDI <- dst
		mov		rax,rdx				; RAX <- val
		mov		rcx,r8				; RCX <- len
		repe	scasq				; scan string while entries are equal to RAX
		pop		rdi					; pop RDI (POP does not change the flags left by the scan)
		je		ScanEquStr_x64_4	; last compared entry was equal: RCX needs no correction
		inc		rcx					; add last loop back (the differing entry was counted too)
ScanEquStr_x64_4:
		mov		rax,r8				; RAX <- old len
		sub		rax,rcx				; RAX <- length of equal data
		ret

ScanEquStr_x64_9:
		xor		rax,rax				; RAX <- 0, no data
		ret

%else	; method "mov"

; ----- prepare registers to big loop
; R9 = dst, RDX = val, RCX = loop counter, R8 = old len
; While the big loop runs, RCX = (entries not yet accepted) - LOOPNUM, where the
; entries of the pass in progress still count as not accepted. The loop continues
; as long as this biased value is >= 0, i.e. a full pass is still available.

ScanEquStr_x64:
		test	r8,r8				; test R8 len
		jle		ScanEquStr_x64_9	; no data
		mov		r9,rcx				; R9 <- dst
		mov		rcx,r8				; RCX <- len
		sub		rcx,LOOPNUM			; RCX <- check number of big loops
		jl		ScanEquStr_x64_4	; no big loop

; ----- big loop
; Entry k of a pass (k = 0..LOOPNUM-1) leaves through label ScanEquStr_x64_res_k
; as soon as it differs from val.

ScanEquStr_x64_2:
%assign LOOPINX 0
%assign LOOPLAB	0
%rep LOOPNUM
		cmp		rdx,[r9+LOOPINX]
		jne		ScanEquStr_x64_res_ %+ LOOPLAB
%assign LOOPINX LOOPINX+8
%assign LOOPLAB	LOOPLAB+1
%endrep
		add		r9,8*LOOPNUM		; shift dst
		sub		rcx,LOOPNUM			; loop counter
		jge		ScanEquStr_x64_2	; loop next step

; ----- prepare registers to small loop

ScanEquStr_x64_4:
		add		rcx,LOOPNUM			; return correct number of remaining loops
		jz		ScanEquStr_x64_8	; no small loop

; ----- small loop
; RCX is the plain (unbiased) number of remaining entries here, so a mismatch
; can go straight to the final subtraction at ScanEquStr_x64_res_00.

ScanEquStr_x64_6:
		cmp		rdx,[r9]
		jne		ScanEquStr_x64_res_00
		add		r9,8				; shift dst
		loop	ScanEquStr_x64_6,rcx	; loop next step

; ----- data are all equal

ScanEquStr_x64_8:
		mov		rax,r8				; RAX <- old number of loops
		ret

; ----- no data

ScanEquStr_x64_9:
		xor		rax,rax				; RAX <- 0, no data
		ret

; ----- result
; Labels res_(LOOPNUM-1) .. res_1 are emitted in descending order, each followed
; by one "dec rcx". Entering at res_k therefore executes k decrements before
; reaching res_0, which removes the k equal entries of the interrupted pass
; from the biased counter. res_0 then removes the bias.

%assign LOOPLAB	LOOPNUM-1
%rep LOOPNUM-1
ScanEquStr_x64_res_ %+ LOOPLAB %+ :
		dec		rcx
%assign LOOPLAB	LOOPLAB-1
%endrep
ScanEquStr_x64_res_0:
		add		rcx,LOOPNUM			; return correct number of remaining loops
ScanEquStr_x64_res_00:
		mov		rax,r8				; RAX <- old number of loops
		sub		rax,rcx				; RAX <- length of equal data
		ret
%endif

; =============================================================================
;                   SCAN NEQU data string in UP direction
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64 val, r8=s64 len
; output: rax=s64 length of equal data

global ScanNEquStr_x64

; Returns in RAX the number of leading entries of the string that are NOT equal
; to val: 0 when len <= 0, len when no entry is equal to val.

%if 0	; 1=use "repne scasq" method, 0=use "mov" method
; In this case "repne scasq" is slower than "mov" method, use "0" flag.

; ----- scan data string using "repne scasq"
; RDI = dst, RAX = val, RCX = loop counter, R8 = old len

ScanNEquStr_x64:
		test	r8,r8				; test R8 len
		jle		ScanNEquStr_x64_9	; no data
		push	rdi					; push RDI
		mov		rdi,rcx				; RDI <- dst
		mov		rax,rdx				; RAX <- val
		mov		rcx,r8				; RCX <- len
		repne	scasq				; scan string while entries differ from RAX
		pop		rdi					; pop RDI (POP does not change the flags left by the scan)
		jne		ScanNEquStr_x64_4	; last compared entry differed: RCX needs no correction
		inc		rcx					; add last loop back (the equal entry was counted too)
ScanNEquStr_x64_4:
		mov		rax,r8				; RAX <- old len
		sub		rax,rcx				; RAX <- length of not equal data
		ret

ScanNEquStr_x64_9:
		xor		rax,rax				; RAX <- 0, no data
		ret

%else	; method "mov"

; ----- prepare registers to big loop
; R9 = dst, RDX = val, RCX = loop counter, R8 = old len
; While the big loop runs, RCX = (entries not yet accepted) - LOOPNUM, where the
; entries of the pass in progress still count as not accepted. The loop continues
; as long as this biased value is >= 0, i.e. a full pass is still available.

ScanNEquStr_x64:
		test	r8,r8				; test R8 len
		jle		ScanNEquStr_x64_9	; no data
		mov		r9,rcx				; R9 <- dst
		mov		rcx,r8				; RCX <- len
		sub		rcx,LOOPNUM			; RCX <- check number of big loops
		jl		ScanNEquStr_x64_4	; no big loop

; ----- big loop
; Entry k of a pass (k = 0..LOOPNUM-1) leaves through label ScanNEquStr_x64_res_k
; as soon as it is equal to val.

ScanNEquStr_x64_2:
%assign LOOPINX 0
%assign LOOPLAB	0
%rep LOOPNUM
		cmp		rdx,[r9+LOOPINX]
		je		ScanNEquStr_x64_res_ %+ LOOPLAB
%assign LOOPINX LOOPINX+8
%assign LOOPLAB	LOOPLAB+1
%endrep
		add		r9,8*LOOPNUM		; shift dst
		sub		rcx,LOOPNUM			; loop counter
		jge		ScanNEquStr_x64_2	; loop next step

; ----- prepare registers to small loop

ScanNEquStr_x64_4:
		add		rcx,LOOPNUM			; return correct number of remaining loops
		jz		ScanNEquStr_x64_8	; no small loop

; ----- small loop
; RCX is the plain (unbiased) number of remaining entries here, so a hit
; can go straight to the final subtraction at ScanNEquStr_x64_res_00.

ScanNEquStr_x64_6:
		cmp		rdx,[r9]
		je		ScanNEquStr_x64_res_00
		add		r9,8				; shift dst
		loop	ScanNEquStr_x64_6,rcx	; loop next step

; ----- data are all not equal

ScanNEquStr_x64_8:
		mov		rax,r8				; RAX <- old number of loops
		ret

; ----- no data

ScanNEquStr_x64_9:
		xor		rax,rax				; RAX <- 0, no data
		ret

; ----- result
; Labels res_(LOOPNUM-1) .. res_1 are emitted in descending order, each followed
; by one "dec rcx". Entering at res_k therefore executes k decrements before
; reaching res_0, which removes the k non-equal entries of the interrupted pass
; from the biased counter. res_0 then removes the bias.

%assign LOOPLAB	LOOPNUM-1
%rep LOOPNUM-1
ScanNEquStr_x64_res_ %+ LOOPLAB %+ :
		dec		rcx
%assign LOOPLAB	LOOPLAB-1
%endrep
ScanNEquStr_x64_res_0:
		add		rcx,LOOPNUM			; return correct number of remaining loops
ScanNEquStr_x64_res_00:
		mov		rax,r8				; RAX <- old number of loops
		sub		rax,rcx				; RAX <- length of not equal data
		ret
%endif

; =============================================================================
;                       SHIFT data string left
; =============================================================================
; inputs: rcx=u64 carry, edx=shift, r8=u64* dst, r9=u64* src, [rsp+28h]=s64 len
; output: rax=u64 carry

global LShiftStr_x64

; Shifts the string left by "shift" bits, working from the last entry down to
; the first one. The low "shift" bits of the input carry fill the vacated low
; bits of the first entry; the bits shifted out of the top of the last entry
; are returned in RAX. With len <= 0 nothing is stored and the input carry is
; returned unchanged.
; NOTE(review): shift is presumably expected to be 1..63. With shift = 0 the
; complementary count computed below is 64, which the CPU reduces to 0 for
; 64-bit shifts, so the carry masking would not take place - confirm callers.

; ----- prepare registers to big loop
; RAX, RDX, R11 = temporary, R8 = dst, R9 = src, CL = shift, R10 = loop counter, [rsp+28h] orig. len

LShiftStr_x64:
		mov		[rsp+8],rcx			; save carry in the shadow space (RCX is needed for the shift count)
		mov		cl,dl				; CL <- shift
		mov		r10,[rsp+28h]		; R10 <- len
		test	r10,r10				; check len
		jle		LShiftStr_x64_9		; no data
		dec		r10					; R10 <- len-1
		lea		r8,[r8+8*r10]		; R8 <- dst last entry
		lea		r9,[r9+8*r10]		; R9 <- src last entry		
		shr		r10,LOOPBIT			; R10 <- number of big loops (ZF is consumed by the JZ below)
		mov		rdx,[r9]			; RDX <- last entry
		push	rdx					; push last entry (stack offsets below grow by 8 from here on)
		jz		LShiftStr_x64_4		; no big loop (MOV and PUSH left the flags of SHR untouched)

; ----- big loop
; Invariant: RDX = source entry whose shifted value is stored next.

LShiftStr_x64_2:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[r9-LOOPINX-8]	; RAX <- low entry
		mov		r11,rax				; R11 <- save low entry
		shld	rdx,rax,cl			; RDX:RAX << shift
		mov		[r8-LOOPINX],rdx	; save shifted entry
		mov		rdx,r11				; RDX <- new high entry
%assign	LOOPINX	LOOPINX+8
%endrep
		sub		r9,8*LOOPNUM		; shift src
		sub		r8,8*LOOPNUM		; shift dst
		dec		r10					; loop counter
		jnz		LShiftStr_x64_2		; loop next step
				
; ----- prepare registers to small loop

LShiftStr_x64_4:
		mov		r10,[rsp+30h]		; R10 <- len (argument slot [rsp+28h] moved by the push)
		dec		r10					; R10 <- len-1
		and		r10,LOOPMASK		; R10 <- number of small loops
		jz		LShiftStr_x64_8		; no small loop

; ----- small loop

LShiftStr_x64_6:
		mov		rax,[r9-8]			; RAX <- low entry
		mov		r11,rax				; R11 <- save low entry
		shld	rdx,rax,cl			; RDX:RAX << shift
		mov		[r8],rdx			; save shifted entry
		mov		rdx,r11				; RDX <- new high entry
		sub		r9,8				; shift src
		sub		r8,8				; shift dst
		dec		r10					; decrement loop
		jnz		LShiftStr_x64_6		; loop next step

; ----- save first entry
; RDX = first source entry, R8 = address of first dst entry.

LShiftStr_x64_8:
		shl		rdx,cl				; RDX <- first entry shifted, low bits vacated
		mov		rax,[rsp+10h]		; RAX <- carry (saved at [rsp+8] before the push)
		sub		cl,64
		neg		cl					; CL <- shift2 (=BIGBITS-shift)
		shl		rax,cl				; carry << shift2
		shr		rax,cl				; (carry << shift2) >> shift2, keeps the low "shift" bits of carry
		or		rax,rdx				; RAX <- first entry combined with input carry
		mov		[r8],rax			; save first entry
		
; ----- get last carry

		pop		rax					; RAX <- last entry (original value, pushed at entry)
		shr		rax,cl				; RAX <- carry (bits shifted out of the last entry)
		ret

;------ no data

LShiftStr_x64_9:
		mov		rax,[rsp+8]			; RAX <- carry (nothing was pushed on this path)
		ret

; =============================================================================
;                       SHIFT data string right
; =============================================================================
; inputs: rcx=u64 carry, edx=shift, r8=u64* dst, r9=u64* src, [rsp+28h]=s64 len
; output: rax=u64 carry

global RShiftStr_x64

; Shifts the string right by "shift" bits, working from the first entry up to
; the last one. The input carry is shifted into the vacated high bits of the
; last entry; the low "shift" bits of the original first entry (the bits shifted
; out) are returned in RAX. With len <= 0 nothing is stored and the input carry
; is returned unchanged.
; NOTE(review): shift is presumably expected to be 1..63. With shift = 0 the
; complementary count computed below is 64, which the CPU reduces to 0 for
; 64-bit shifts, so the carry handling would not work - confirm callers.

; ----- prepare registers to big loop
; RAX, RDX = temporary, R8 = dst, R9 = src, CL = shift, R10 = loop counter, [rsp+28h] orig. len

RShiftStr_x64:
		mov		[rsp+8],rcx			; save carry in the shadow space (RCX is needed for the shift count)
		mov		cl,dl				; CL <- shift
		mov		r10,[rsp+28h]		; R10 <- len
		test	r10,r10				; check len
		jle		RShiftStr_x64_9		; no data
		dec		r10					; R10 <- len-1
		shr		r10,LOOPBIT			; R10 <- number of big loops (ZF is consumed by the JZ below)
		mov		rdx,[r9]			; RDX <- first entry
		push	rdx					; push first entry (stack offsets below grow by 8 from here on)
		jz		RShiftStr_x64_4		; no big loop (MOV and PUSH left the flags of SHR untouched)

; ----- big loop
; Invariant: RDX = source entry whose shifted value is stored next.
; SHRD must shift RDX (the low entry) and take the incoming high bits from RAX
; (the next higher entry); SHRD leaves its source RAX unchanged, so RAX can be
; reused directly as the next low entry.

RShiftStr_x64_2:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[r9+LOOPINX+8]	; RAX <- high entry
		shrd	rdx,rax,cl			; RDX <- (RDX >> shift) | (RAX << (64-shift))
		mov		[r8+LOOPINX],rdx	; save shifted entry
		mov		rdx,rax				; RDX <- new low entry
%assign	LOOPINX	LOOPINX+8
%endrep
		add		r9,8*LOOPNUM		; shift src
		add		r8,8*LOOPNUM		; shift dst
		dec		r10					; loop counter
		jnz		RShiftStr_x64_2		; loop next step

; ----- prepare registers to small loop

RShiftStr_x64_4:
		mov		r10,[rsp+30h]		; R10 <- len (argument slot [rsp+28h] moved by the push)
		dec		r10					; R10 <- len-1
		and		r10,LOOPMASK		; R10 <- number of small loops
		jz		RShiftStr_x64_8		; no small loop

; ----- small loop

RShiftStr_x64_6:
		mov		rax,[r9+8]			; RAX <- high entry
		shrd	rdx,rax,cl			; RDX <- (RDX >> shift) | (RAX << (64-shift))
		mov		[r8],rdx			; save shifted entry
		mov		rdx,rax				; RDX <- new low entry
		add		r9,8				; shift src
		add		r8,8				; shift dst
		dec		r10					; decrement loop
		jnz		RShiftStr_x64_6		; loop next step

; ----- save last entry
; RDX = last source entry, R8 = address of last dst entry.

RShiftStr_x64_8:
		shr		rdx,cl				; RDX <- last entry shifted, high bits vacated
		mov		rax,[rsp+10h]		; RAX <- carry (saved at [rsp+8] before the push)
		sub		cl,64
		neg		cl					; CL <- shift2 (=BIGBITS-shift)
		shl		rax,cl				; carry << shift2, moves carry into the vacated high bits
		or		rax,rdx				; RAX <- last entry combined with input carry
		mov		[r8],rax			; save last entry

; ----- get output carry

		pop		rax					; RAX <- first entry (original value, pushed at entry)
		shl		rax,cl				; RAX <- first entry << shift2
		shr		rax,cl				; (first entry << shift2) >> shift2, keeps its low "shift" bits
		ret

;------ no data

RShiftStr_x64_9:
		mov		rax,[rsp+8]			; RAX <- carry (nothing was pushed on this path)
		ret

; =============================================================================
;                           COMPARE EQU data string
; =============================================================================
; inputs: rcx=u64* num1, rdx=u64* num2, r8=s64 len
; output: rax=s64 length of equal data

global CompEquStr_x64

; Returns in RAX the number of leading entries in which num1 and num2 are
; equal: 0 when len <= 0, len when the strings are equal in all entries.

%if 0	; 1=use "repe cmpsq" method, 0=use "cmp" method
; In this case "repe cmpsq" is slower than "cmp" method, use "0" flag.

; ----- compare data strings using "repe cmpsq"
; RDI = num1, RSI = num2, RCX = loop counter, R8 = old len

CompEquStr_x64:
		test	r8,r8				; test R8 len
		jle		CompEquStr_x64_9	; no data
		push	rsi					; push RSI
		push	rdi					; push RDI
		mov		rdi,rcx				; RDI <- num1
		mov		rsi,rdx				; RSI <- num2
		mov		rcx,r8				; RCX <- len
		repe	cmpsq				; compare strings while entries are equal
		pop		rdi					; pop RDI (POP does not change the flags left by the compare)
		pop		rsi					; pop RSI
		je		CompEquStr_x64_4	; last compared entries were equal: RCX needs no correction
		inc		rcx					; add last loop back (the differing entry was counted too)
CompEquStr_x64_4:
		mov		rax,r8				; RAX <- old len
		sub		rax,rcx				; RAX <- length of equal data
		ret

CompEquStr_x64_9:
		xor		rax,rax				; RAX <- 0, no data
		ret

%else	; method "cmp"

; ----- prepare registers to big loop
; RAX = temporary, R9 = num1, RDX = num2, RCX = loop counter, R8 = old len
; While the big loop runs, RCX = (entries not yet accepted) - LOOPNUM, where the
; entries of the pass in progress still count as not accepted. The loop continues
; as long as this biased value is >= 0, i.e. a full pass is still available.

CompEquStr_x64:
		test	r8,r8				; check len
		jle		CompEquStr_x64_9	; no data
		mov		r9,rcx				; R9 <- num1
		mov		rcx,r8				; RCX <- len
		sub		rcx,LOOPNUM			; RCX <- check number of big loops
		jl		CompEquStr_x64_4	; no big loop

; ----- big loop
; Entry k of a pass (k = 0..LOOPNUM-1) leaves through label CompEquStr_x64_res_k
; as soon as the two strings differ there.

CompEquStr_x64_2:
%assign LOOPINX 0
%assign LOOPLAB	0
%rep LOOPNUM
		mov		rax,[r9+LOOPINX]
		cmp		rax,[rdx+LOOPINX]
		jne		CompEquStr_x64_res_ %+ LOOPLAB
%assign LOOPINX LOOPINX+8
%assign LOOPLAB	LOOPLAB+1
%endrep
		add		r9,8*LOOPNUM		; shift num1
		add		rdx,8*LOOPNUM		; shift num2
		sub		rcx,LOOPNUM			; loop counter
		jge		CompEquStr_x64_2	; loop next step

; ----- prepare registers to small loop

CompEquStr_x64_4:
		add		rcx,LOOPNUM			; return correct number of remaining loops
		jz		CompEquStr_x64_8	; no small loop

; ----- small loop
; RCX is the plain (unbiased) number of remaining entries here, so a mismatch
; can go straight to the final subtraction at CompEquStr_x64_res_00.

CompEquStr_x64_6:
		mov		rax,[r9]
		cmp		rax,[rdx]
		jne		CompEquStr_x64_res_00
		add		r9,8				; shift num1
		add		rdx,8				; shift num2
		loop	CompEquStr_x64_6,rcx ; loop next step

; ----- num1 == num2

CompEquStr_x64_8:
		mov		rax,r8				; RAX <- len
		ret

; ----- no data

CompEquStr_x64_9:
		xor		rax,rax				; RAX <- 0 no data
		ret

; ----- result
; Labels res_(LOOPNUM-1) .. res_1 are emitted in descending order, each followed
; by one "dec rcx". Entering at res_k therefore executes k decrements before
; reaching res_0, which removes the k equal entries of the interrupted pass
; from the biased counter. The decrement must act on the full 64-bit RCX: the
; biased counter can go below zero here (mismatch in the last full pass), and
; a 32-bit "dec ecx" would clear the upper half of RCX and corrupt the result.

%assign LOOPLAB	LOOPNUM-1
%rep LOOPNUM-1
CompEquStr_x64_res_ %+ LOOPLAB %+ :
		dec		rcx					; loop counter
%assign LOOPLAB	LOOPLAB-1
%endrep
CompEquStr_x64_res_0:
		add		rcx,LOOPNUM			; return correct number of remaining loops
CompEquStr_x64_res_00:
		mov		rax,r8				; RAX <- old number of loops
		sub		rax,rcx				; RAX <- length of equal data
		ret
%endif

; =============================================================================
;                           COMPARE data string
; =============================================================================
; inputs: rcx=u64* num1, rdx=u64* num2, r8=s64 len
; output: rax= -1 num1<num2, 0 num1=num2, 1 num1>num2

global CompStr_x64

; Compares num1 with num2 entry by entry, starting at the last entry and moving
; towards the first one. The first pair of differing entries decides the result
; by an unsigned comparison: RAX = 1 when the num1 entry is above, RAX = -1
; otherwise. RAX = 0 when all entries are equal or len <= 0.
; RAX = scratch, R9 = num1 cursor, RDX = num2 cursor, RCX = loop counter, R8 = original len

CompStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.equal				; yes, treated as equal
		lea		r9,[rcx+r8*8-8]		; R9 <- address of last num1 entry
		lea		rdx,[rdx+r8*8-8]	; RDX <- address of last num2 entry
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass, highest address first
; Entry k of a pass leaves through label CompStr_x64_res_k when it differs.

.group:
%assign LOOPINX 0
%assign LOOPLAB	0
%rep LOOPNUM
		mov		rax,[r9-LOOPINX]
		cmp		rax,[rdx-LOOPINX]
		jne		CompStr_x64_res_ %+ LOOPLAB
%assign LOOPINX LOOPINX+8
%assign LOOPLAB	LOOPLAB+1
%endrep
		sub		r9,8*LOOPNUM		; move num1 down by one group
		sub		rdx,8*LOOPNUM		; move num2 down by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.equal				; none left

.single:
		mov		rax,[r9]
		cmp		rax,[rdx]
		jne		CompStr_x64_res_0	; cursors already point at the differing pair
		sub		r9,8				; move num1 down by one entry
		sub		rdx,8				; move num2 down by one entry
		dec		rcx
		jnz		.single

; ----- no difference found

.equal:
		xor		eax,eax				; RAX <- 0 (num1 == num2)
		ret

; ----- result
; Labels res_(LOOPNUM-1) .. res_1 are emitted in descending order; entering at
; res_k steps both cursors down k entries so that they point at the differing pair.

%assign LOOPLAB	LOOPNUM-1
%rep LOOPNUM-1
CompStr_x64_res_ %+ LOOPLAB %+ :
		sub		r9,8				; move num1 down by one entry
		sub		rdx,8				; move num2 down by one entry
%assign LOOPLAB	LOOPLAB-1
%endrep

CompStr_x64_res_0:
		mov		rcx,[r9]			; RCX <- differing num1 entry
		cmp		rcx,[rdx]			; compare with num2 entry (unsigned)
		mov		eax,1				; RAX <- 1 (num1 > num2); MOV keeps the flags of CMP
		ja		.greater			; num1 entry is above
		mov		rax,-1				; RAX <- -1 (num1 < num2)
.greater:
		ret

; =============================================================================
;                          NOT data string (1 string)
; =============================================================================
; inputs: rcx=u64* dst, rdx=s64 len

global NotStr_x64

; Inverts every bit of len qwords at dst, in place. Does nothing when len <= 0.
; R8 = dst cursor, RCX = loop counter, RDX = original len

NotStr_x64:
		test	rdx,rdx				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r8,rcx				; R8 <- dst, frees RCX for counting
		mov		rcx,rdx
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		not		qword [r8+LOOPINX]	; invert entry in memory
%assign LOOPINX LOOPINX+8
%endrep
		add		r8,8*LOOPNUM		; advance dst by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,rdx
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		not		qword [r8]			; invert entry in memory
		add		r8,8				; advance dst by one entry
		dec		rcx
		jnz		.single

.done:
		ret

; =============================================================================
;                          NOT data string (2 strings)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len

global NotStr2_x64

; Stores the bitwise complement of len qwords from src into dst.
; Does nothing when len <= 0.
; RAX = scratch, R9 = dst cursor, RDX = src cursor, RCX = loop counter, R8 = original len

NotStr2_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r9,rcx				; R9 <- dst, frees RCX for counting
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[rdx+LOOPINX]	; RAX <- src entry
		not		rax					; RAX <- ~src entry
		mov		[r9+LOOPINX],rax	; dst entry <- ~src entry
%assign	LOOPINX	LOOPINX+8
%endrep
		add		r9,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[rdx]			; RAX <- src entry
		not		rax					; RAX <- ~src entry
		mov		[r9],rax			; dst entry <- ~src entry
		add		r9,8				; advance dst by one entry
		add		rdx,8				; advance src by one entry
		dec		rcx
		jnz		.single

.done:
		ret

; =============================================================================
;                          AND data string (1 string)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len

global AndStr_x64

; dst[i] &= src[i] for len qwords. Does nothing when len <= 0.
; RAX = scratch, R9 = dst cursor, RDX = src cursor, RCX = loop counter, R8 = original len

AndStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r9,rcx				; R9 <- dst, frees RCX for counting
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[rdx+LOOPINX]	; RAX <- src entry
		and		[r9+LOOPINX],rax	; dst entry &= src entry
%assign LOOPINX LOOPINX+8
%endrep
		add		r9,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[rdx]			; RAX <- src entry
		and		[r9],rax			; dst entry &= src entry
		add		r9,8				; advance dst by one entry
		add		rdx,8				; advance src by one entry
		dec		rcx
		jnz		.single

.done:
		ret

; =============================================================================
;                          AND data string (2 strings)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=s64 len

global AndStr2_x64

; dst[i] = src1[i] & src2[i] for len qwords. Does nothing when len <= 0.
; RAX = scratch, R10 = dst cursor, RDX = src1 cursor, R8 = src2 cursor,
; RCX = loop counter, R9 = original len

AndStr2_x64:
		test	r9,r9				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r10,rcx				; R10 <- dst, frees RCX for counting
		mov		rcx,r9
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- src2 entry
		and		rax,[rdx+LOOPINX]	; RAX <- src2 & src1
		mov		[r10+LOOPINX],rax	; dst entry <- result
%assign LOOPINX LOOPINX+8
%endrep
		add		r10,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src1 by one group
		add		r8,8*LOOPNUM		; advance src2 by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r9
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[r8]			; RAX <- src2 entry
		and		rax,[rdx]			; RAX <- src2 & src1
		mov		[r10],rax			; dst entry <- result
		add		r10,8				; advance dst by one entry
		add		rdx,8				; advance src1 by one entry
		add		r8,8				; advance src2 by one entry
		dec		rcx
		jnz		.single

.done:
		ret

; =============================================================================
;                          OR data string (1 string)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len

global OrStr_x64

; dst[i] |= src[i] for len qwords. Does nothing when len <= 0.
; RAX = scratch, R9 = dst cursor, RDX = src cursor, RCX = loop counter, R8 = original len

OrStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r9,rcx				; R9 <- dst, frees RCX for counting
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[rdx+LOOPINX]	; RAX <- src entry
		or		[r9+LOOPINX],rax	; dst entry |= src entry
%assign LOOPINX LOOPINX+8
%endrep
		add		r9,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[rdx]			; RAX <- src entry
		or		[r9],rax			; dst entry |= src entry
		add		r9,8				; advance dst by one entry
		add		rdx,8				; advance src by one entry
		dec		rcx
		jnz		.single

.done:
		ret

; =============================================================================
;                          OR data string (2 strings)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=s64 len

global OrStr2_x64

; dst[i] = src1[i] | src2[i] for len qwords. Does nothing when len <= 0.
; RAX = scratch, R10 = dst cursor, RDX = src1 cursor, R8 = src2 cursor,
; RCX = loop counter, R9 = original len

OrStr2_x64:
		test	r9,r9				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r10,rcx				; R10 <- dst, frees RCX for counting
		mov		rcx,r9
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- src2 entry
		or		rax,[rdx+LOOPINX]	; RAX <- src2 | src1
		mov		[r10+LOOPINX],rax	; dst entry <- result
%assign LOOPINX LOOPINX+8
%endrep
		add		r10,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src1 by one group
		add		r8,8*LOOPNUM		; advance src2 by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r9
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[r8]			; RAX <- src2 entry
		or		rax,[rdx]			; RAX <- src2 | src1
		mov		[r10],rax			; dst entry <- result
		add		r10,8				; advance dst by one entry
		add		rdx,8				; advance src1 by one entry
		add		r8,8				; advance src2 by one entry
		dec		rcx
		jnz		.single

.done:
		ret

; =============================================================================
;                          XOR data string (1 string)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len

global XorStr_x64

; dst[i] ^= src[i] for len qwords. Does nothing when len <= 0.
; RAX = scratch, R9 = dst cursor, RDX = src cursor, RCX = loop counter, R8 = original len

XorStr_x64:
		test	r8,r8				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r9,rcx				; R9 <- dst, frees RCX for counting
		mov		rcx,r8
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[rdx+LOOPINX]	; RAX <- src entry
		xor		[r9+LOOPINX],rax	; dst entry ^= src entry
%assign LOOPINX LOOPINX+8
%endrep
		add		r9,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r8
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[rdx]			; RAX <- src entry
		xor		[r9],rax			; dst entry ^= src entry
		add		r9,8				; advance dst by one entry
		add		rdx,8				; advance src by one entry
		dec		rcx
		jnz		.single

.done:
		ret

; =============================================================================
;                          XOR data string (2 strings)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=s64 len

global XorStr2_x64

; dst[i] = src1[i] ^ src2[i] for len qwords. Does nothing when len <= 0.
; RAX = scratch, R10 = dst cursor, RDX = src1 cursor, R8 = src2 cursor,
; RCX = loop counter, R9 = original len

XorStr2_x64:
		test	r9,r9				; len <= 0 ?
		jle		.done				; yes, nothing to do
		mov		r10,rcx				; R10 <- dst, frees RCX for counting
		mov		rcx,r9
		shr		rcx,LOOPBIT			; RCX <- number of full groups of LOOPNUM entries
		jz		.tail				; fewer than LOOPNUM entries

; ----- unrolled loop, LOOPNUM entries per pass

.group:
%assign LOOPINX 0
%rep LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- src2 entry
		xor		rax,[rdx+LOOPINX]	; RAX <- src2 ^ src1
		mov		[r10+LOOPINX],rax	; dst entry <- result
%assign LOOPINX LOOPINX+8
%endrep
		add		r10,8*LOOPNUM		; advance dst by one group
		add		rdx,8*LOOPNUM		; advance src1 by one group
		add		r8,8*LOOPNUM		; advance src2 by one group
		dec		rcx					; one group done
		jnz		.group

; ----- leftover entries (len AND LOOPMASK)

.tail:
		mov		rcx,r9
		and		rcx,LOOPMASK		; RCX <- number of leftover entries
		jz		.done				; none left

.single:
		mov		rax,[r8]			; RAX <- src2 entry
		xor		rax,[rdx]			; RAX <- src2 ^ src1
		mov		[r10],rax			; dst entry <- result
		add		r10,8				; advance dst by one entry
		add		rdx,8				; advance src1 by one entry
		add		r8,8				; advance src2 by one entry
		dec		rcx
		jnz		.single

.done:
		ret
		
; =============================================================================
;                          INC data string (1 string)
; =============================================================================
; C prototype (Win64): u64 IncStr_x64(u64* dst, s64 len);
; Adds 1 to the multi-word number in dst[0..len-1] (dst[0] is the lowest word).
; inputs: rcx=u64* dst, rdx=s64 len
; output: rax=carry flag 0 or 1 (1 = carry out of the last word, also when len <= 0)
; destroys: rcx, r8

global IncStr_x64

; ----- prepare registers to big loop
; R8 = dst, RCX = loop counter, RDX = old len

IncStr_x64:
		test	rdx,rdx				; check len
		jle		IncStr_x64_8		; no data, the carry passes straight through
		mov		r8,rcx				; R8 <- dst
		mov		rcx,rdx				; RCX <- len
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		IncStr_x64_4		; no big loop

; ----- big loop (LOOPNUM words per step, carry chained through CF)

IncStr_x64_2:
		add		qword [r8],1		; add first carry
%assign LOOPINX 0
%rep LOOPNUM-1
%assign LOOPINX LOOPINX+8
		adc		qword [r8+LOOPINX],0 ; add carry
%endrep
		jnc		IncStr_x64_9		; no carry, higher words stay unchanged
		add		r8,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		IncStr_x64_2		; loop next step

; ----- prepare registers to small loop

IncStr_x64_4:
		mov		rcx,rdx				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		IncStr_x64_8		; no small loop

; ----- small loop (entered only while a carry is still pending)

IncStr_x64_6:
		inc		qword [r8]			; add carry (INC keeps CF, so wrap to 0 is tested via ZF)
		jnz		IncStr_x64_9		; no carry
		add		r8,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		IncStr_x64_6		; loop next step

; ----- return with carry set

IncStr_x64_8:
		mov		eax,1				; RAX <- 1, carry (32-bit write clears the upper half)
		ret

; ----- return with carry not set

IncStr_x64_9:
		xor		eax,eax				; RAX <- 0, no carry
		ret

; =============================================================================
;                          INC data string (2 strings)
; =============================================================================
; C prototype (Win64): u64 IncStr2_x64(u64* dst, const u64* src, s64 len);
; Stores src + 1 into dst, both len words long (word 0 is the lowest word).
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len
; output: rax=carry flag 0 or 1 (1 = carry out of the last word, also when len <= 0)
; destroys: rcx, rdx, r8, r9, r10 and whatever CopyStr_x64 destroys

global IncStr2_x64

; ----- prepare registers to big loop
; RAX = carry (small loop), R10 = temporary, R9 = dst, RDX = src, RCX = loop counter, R8 = old len

IncStr2_x64:
		test	r8,r8				; check len
		jle		IncStr2_x64_7		; no data, the carry passes straight through
		mov		r9,rcx				; R9 <- dst
		mov		rcx,r8				; RCX <- len
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		IncStr2_x64_4		; no big loop

; ----- big loop (LOOPNUM words per step, carry chained through CF; MOV keeps CF)

IncStr2_x64_2:
		mov		r10,[rdx]
		add		r10,1				; add first carry
		mov		[r9],r10
%assign LOOPINX 0
%rep LOOPNUM-1
%assign	LOOPINX	LOOPINX+8
		mov		r10,[rdx+LOOPINX]
		adc		r10,0				; add carry
		mov		[r9+LOOPINX],r10
%endrep
		jnc		IncStr2_x64_8		; no carry, copy rest of data
		add		rdx,8*LOOPNUM		; shift src
		add		r9,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		IncStr2_x64_2		; loop next step

; ----- prepare registers to small loop
; A carry is always pending here: either nothing was added yet,
; or every big loop step ended with carry set.

IncStr2_x64_4:
		mov		rcx,r8				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		IncStr2_x64_7		; no small loop
		or		rax,1				; RAX bit 0 <- set carry (other bits are don't care)

; ----- small loop (carry is parked in RAX bit 0 across the pointer ADDs)

IncStr2_x64_6:
		rcr		rax,1				; CY <- pop carry
		mov		rax,[rdx]
		adc		rax,0				; add carry
		mov		[r9],rax
		rcl		rax,1				; RAX bit 0 <- push carry
		add		rdx,8				; shift src
		add		r9,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		IncStr2_x64_6		; loop next step
		and		rax,1				; carry
		ret

; ----- return with carry set

IncStr2_x64_7:
		mov		eax,1				; RAX <- 1, carry (32-bit write clears the upper half)
		ret

; ----- copy rest of data
; The carry died inside the current big block, so the remaining words are
; copied unchanged: all following big blocks plus the small-loop words.
; CopyStr_x64 is defined elsewhere in this file; the register setup below
; passes rcx=dst, rdx=src, r8=len.

IncStr2_x64_8:
		add		rdx,8*LOOPNUM		; shift src behind the current block
		dec		rcx					; loop counter, skip the current block
		shl		rcx,LOOPBIT			; RCX <- words in the remaining big loops
		and		r8,LOOPMASK			; R8 <- number of small loops
		or		r8,rcx				; R8 <- rest of len
		lea		rcx,[r9+8*LOOPNUM]	; RCX <- dst behind the current block
		sub		rsp,28h				; 20h shadow space + 8 to align stack to 16 bytes
		call	CopyStr_x64			; copy rest of string
		add		rsp,28h

; ----- return with carry not set

IncStr2_x64_9:
		xor		eax,eax				; RAX <- 0, no carry
		ret
		
; =============================================================================
;                          DEC data string (1 string)
; =============================================================================
; C prototype (Win64): u64 DecStr_x64(u64* dst, s64 len);
; Subtracts 1 from the multi-word number in dst[0..len-1] (dst[0] is the lowest word).
; inputs: rcx=u64* dst, rdx=s64 len
; output: rax=carry flag 0 or 1 (1 = borrow out of the last word, also when len <= 0)
; destroys: rcx, r8

global DecStr_x64

; ----- prepare registers to big loop
; R8 = dst, RCX = loop counter, RDX = old len

DecStr_x64:
		test	rdx,rdx				; check len
		jle		DecStr_x64_8		; no data, the borrow passes straight through
		mov		r8,rcx				; R8 <- dst
		mov		rcx,rdx				; RCX <- len
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		DecStr_x64_4		; no big loop

; ----- big loop (LOOPNUM words per step, borrow chained through CF)

DecStr_x64_2:
		sub		qword [r8],1		; sub first carry
%assign LOOPINX 0
%rep LOOPNUM-1
%assign LOOPINX LOOPINX+8
		sbb		qword [r8+LOOPINX],0 ; sub carry
%endrep
		jnc		DecStr_x64_9		; no carry, higher words stay unchanged
		add		r8,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		DecStr_x64_2		; loop next step

; ----- prepare registers to small loop

DecStr_x64_4:
		mov		rcx,rdx				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		DecStr_x64_8		; no small loop

; ----- small loop (entered only while a borrow is still pending)

DecStr_x64_6:
		sub		qword [r8],1		; sub carry
		jnc		DecStr_x64_9		; no carry
		add		r8,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		DecStr_x64_6		; loop next step

; ----- return with carry set

DecStr_x64_8:
		mov		eax,1				; RAX <- 1, carry (32-bit write clears the upper half)
		ret

; ----- return with carry not set

DecStr_x64_9:
		xor		eax,eax				; RAX <- 0, no carry
		ret

; =============================================================================
;                          DEC data string (2 strings)
; =============================================================================
; C prototype (Win64): u64 DecStr2_x64(u64* dst, const u64* src, s64 len);
; Stores src - 1 into dst, both len words long (word 0 is the lowest word).
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len
; output: rax=carry flag 0 or 1 (1 = borrow out of the last word, also when len <= 0)
; destroys: rcx, rdx, r8, r9, r10 and whatever CopyStr_x64 destroys

global DecStr2_x64

; ----- prepare registers to big loop
; RAX = carry (small loop), R10 = temporary, R9 = dst, RDX = src, RCX = loop counter, R8 = old len

DecStr2_x64:
		test	r8,r8				; check len
		jle		DecStr2_x64_7		; no data, the borrow passes straight through
		mov		r9,rcx				; R9 <- dst
		mov		rcx,r8				; RCX <- len
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		DecStr2_x64_4		; no big loop

; ----- big loop (LOOPNUM words per step, borrow chained through CF; MOV keeps CF)

DecStr2_x64_2:
		mov		r10,[rdx]
		sub		r10,1				; sub first carry
		mov		[r9],r10
%assign LOOPINX 0
%rep LOOPNUM-1
%assign	LOOPINX	LOOPINX+8
		mov		r10,[rdx+LOOPINX]
		sbb		r10,0				; sub carry
		mov		[r9+LOOPINX],r10
%endrep
		jnc		DecStr2_x64_8		; no carry, copy rest of data
		add		rdx,8*LOOPNUM		; shift src
		add		r9,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		DecStr2_x64_2		; loop next step

; ----- prepare registers to small loop
; A borrow is always pending here: either nothing was subtracted yet,
; or every big loop step ended with carry set.

DecStr2_x64_4:
		mov		rcx,r8				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		DecStr2_x64_7		; no small loop
		or		rax,1				; RAX bit 0 <- set carry (other bits are don't care)

; ----- small loop (borrow is parked in RAX bit 0 across the pointer ADDs)

DecStr2_x64_6:
		rcr		rax,1				; CY <- pop carry
		mov		rax,[rdx]
		sbb		rax,0				; sub carry
		mov		[r9],rax
		rcl		rax,1				; RAX bit 0 <- push carry
		add		rdx,8				; shift src
		add		r9,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		DecStr2_x64_6		; loop next step
		and		rax,1				; carry
		ret

; ----- return with carry set

DecStr2_x64_7:
		mov		eax,1				; RAX <- 1, carry (32-bit write clears the upper half)
		ret

; ----- copy rest of data
; The borrow died inside the current big block, so the remaining words are
; copied unchanged: all following big blocks plus the small-loop words.
; CopyStr_x64 is defined elsewhere in this file; the register setup below
; passes rcx=dst, rdx=src, r8=len.

DecStr2_x64_8:
		add		rdx,8*LOOPNUM		; shift src behind the current block
		dec		rcx					; loop counter, skip the current block
		shl		rcx,LOOPBIT			; RCX <- words in the remaining big loops
		and		r8,LOOPMASK			; R8 <- number of small loops
		or		r8,rcx				; R8 <- rest of len
		lea		rcx,[r9+8*LOOPNUM]	; RCX <- dst behind the current block
		sub		rsp,28h				; 20h shadow space + 8 to align stack to 16 bytes
		call	CopyStr_x64			; copy rest of string
		add		rsp,28h

; ----- return with carry not set

DecStr2_x64_9:
		xor		eax,eax				; RAX <- 0, no carry
		ret

; =============================================================================
;                         ADD data string (1 string)
; =============================================================================
; C prototype (Win64): u64 AddStr_x64(u64 carry, u64* dst, const u64* src, s64 len);
; Computes dst += src + (carry & 1) over len words (word 0 is the lowest word).
; inputs: rcx=u64 carry, rdx=u64* dst, r8=u64* src, r9=s64 len
; output: rax=u64 carry (0 or 1; input carry bit 0 is returned when len <= 0)
; destroys: rcx, rdx, r8

global AddStr_x64

; ----- prepare registers to big loop
; RAX = temporary and carry in bit 0, RDX = dst, R8 = src, RCX = loop counter, R9 = old len

AddStr_x64:
		mov		rax,rcx				; RAX <- carry
		test	r9,r9				; check len
		jle		AddStr_x64_8		; no data
		mov		rcx,r9				; RCX <- len
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		AddStr_x64_4		; no big loop

; ----- big loop (carry lives in CF inside the block, MOV does not touch it)

AddStr_x64_2:
		rcr		rax,1				; CY <- carry
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- [src]
		adc		[rdx+LOOPINX],rax	; add src to dst
%assign	LOOPINX	LOOPINX+8
%endrep
		rcl		rax,1				; RAX bit 0 <- carry (saved, ADD below destroys CF)
		add		r8,8*LOOPNUM		; shift src
		add		rdx,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		AddStr_x64_2		; loop next step

; ----- prepare registers to small loop

AddStr_x64_4:
		mov		rcx,r9				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		AddStr_x64_8		; no small loop

; ----- small loop

AddStr_x64_6:
		rcr		rax,1				; CY <- carry
		mov		rax,[r8]			; RAX <- [src]
		adc		[rdx],rax			; add src to dst
		rcl		rax,1				; RAX bit 0 <- carry
		add		r8,8				; shift src
		add		rdx,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		AddStr_x64_6		; loop next step

; ----- carry

AddStr_x64_8:
		and		rax,1				; RAX <- carry
		ret

; =============================================================================
;                         ADD data string (2 strings)
; =============================================================================
; C prototype (Win64): u64 AddStr2_x64(u64 carry, u64* dst, const u64* src1, const u64* src2, s64 len);
; Computes dst = src1 + src2 + (carry & 1) over len words (word 0 is the lowest word).
; inputs: rcx=u64 carry, rdx=u64* dst, r8=u64* src1, r9=u64* src2, [rsp+28h]=s64 len
; output: rax=u64 carry (0 or 1; input carry bit 0 is returned when len <= 0)
; destroys: rcx, rdx, r8, r9

global AddStr2_x64

; ----- prepare registers to big loop
; RAX = temporary and carry in bit 0, RDX = dst, R8 = src1, R9 = src2, RCX = loop counter, [rsp+28h] = old len

AddStr2_x64:
		mov		rax,rcx				; RAX <- carry
		mov		rcx,[rsp+28h]		; RCX <- len (5th parameter, on the stack)
		test	rcx,rcx				; check len
		jle		AddStr2_x64_8		; no data
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		AddStr2_x64_4		; no big loop

; ----- big loop (carry lives in CF inside the block, MOV does not touch it)

AddStr2_x64_2:
		rcr		rax,1				; CY <- carry
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- [src1]
		adc		rax,[r9+LOOPINX]	; RAX,CY <- [src1] + [src2] + CY
		mov		[rdx+LOOPINX],rax	; save result
%assign	LOOPINX	LOOPINX+8
%endrep
		rcl		rax,1				; RAX bit 0 <- carry (saved, ADD below destroys CF)
		add		r8,8*LOOPNUM		; shift src1
		add		r9,8*LOOPNUM		; shift src2
		add		rdx,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		AddStr2_x64_2		; loop next step

; ----- prepare registers to small loop

AddStr2_x64_4:
		mov		rcx,[rsp+28h]		; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		AddStr2_x64_8		; no small loop

; ----- small loop

AddStr2_x64_6:
		rcr		rax,1				; CY <- carry
		mov		rax,[r8]			; RAX <- [src1]
		adc		rax,[r9]			; RAX,CY <- [src1] + [src2] + CY
		mov		[rdx],rax			; save result
		rcl		rax,1				; RAX bit 0 <- carry
		add		r8,8				; shift src1
		add		r9,8				; shift src2
		add		rdx,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		AddStr2_x64_6		; loop next step

; ----- carry

AddStr2_x64_8:
		and		rax,1				; RAX <- carry
		ret

; =============================================================================
;                         SUB data string (1 string)
; =============================================================================
; C prototype (Win64): u64 SubStr_x64(u64 carry, u64* dst, const u64* src, s64 len);
; Computes dst -= src + (carry & 1) over len words (word 0 is the lowest word).
; inputs: rcx=u64 carry, rdx=u64* dst, r8=u64* src, r9=s64 len
; output: rax=u64 carry (borrow, 0 or 1; input carry bit 0 is returned when len <= 0)
; destroys: rcx, rdx, r8

global SubStr_x64

; ----- prepare registers to big loop
; RAX = temporary and carry in bit 0, RDX = dst, R8 = src, RCX = loop counter, R9 = old len

SubStr_x64:
		mov		rax,rcx				; RAX <- carry
		test	r9,r9				; check len
		jle		SubStr_x64_8		; no data
		mov		rcx,r9				; RCX <- len
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		SubStr_x64_4		; no big loop

; ----- big loop (borrow lives in CF inside the block, MOV does not touch it)

SubStr_x64_2:
		rcr		rax,1				; CY <- carry
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- [src]
		sbb		[rdx+LOOPINX],rax	; sub src from dst
%assign	LOOPINX	LOOPINX+8
%endrep
		rcl		rax,1				; RAX bit 0 <- carry (saved, ADD below destroys CF)
		add		r8,8*LOOPNUM		; shift src
		add		rdx,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		SubStr_x64_2		; loop next step

; ----- prepare registers to small loop

SubStr_x64_4:
		mov		rcx,r9				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		SubStr_x64_8		; no small loop

; ----- small loop

SubStr_x64_6:
		rcr		rax,1				; CY <- carry
		mov		rax,[r8]			; RAX <- [src]
		sbb		[rdx],rax			; sub src from dst
		rcl		rax,1				; RAX bit 0 <- carry
		add		r8,8				; shift src
		add		rdx,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		SubStr_x64_6		; loop next step

; ----- carry

SubStr_x64_8:
		and		rax,1				; RAX <- carry
		ret

; =============================================================================
;                         SUB data string (2 strings)
; =============================================================================
; C prototype (Win64): u64 SubStr2_x64(u64 carry, u64* dst, const u64* src1, const u64* src2, s64 len);
; Computes dst = src1 - src2 - (carry & 1) over len words (word 0 is the lowest word).
; inputs: rcx=u64 carry, rdx=u64* dst, r8=u64* src1, r9=u64* src2, [rsp+28h]=s64 len
; output: rax=u64 carry (borrow, 0 or 1; input carry bit 0 is returned when len <= 0)
; destroys: rcx, rdx, r8, r9

global SubStr2_x64

; ----- prepare registers to big loop
; RAX = temporary and carry in bit 0, RDX = dst, R8 = src1, R9 = src2, RCX = loop counter, [rsp+28h] = old len

SubStr2_x64:
		mov		rax,rcx				; RAX <- carry
		mov		rcx,[rsp+28h]		; RCX <- len (5th parameter, on the stack)
		test	rcx,rcx				; check len
		jle		SubStr2_x64_8		; no data
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		SubStr2_x64_4		; no big loop

; ----- big loop (borrow lives in CF inside the block, MOV does not touch it)

SubStr2_x64_2:
		rcr		rax,1				; CY <- carry
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- [src1]
		sbb		rax,[r9+LOOPINX]	; RAX,CY <- [src1] - [src2] - CY
		mov		[rdx+LOOPINX],rax	; save result
%assign	LOOPINX	LOOPINX+8
%endrep
		rcl		rax,1				; RAX bit 0 <- carry (saved, ADD below destroys CF)
		add		r8,8*LOOPNUM		; shift src1
		add		r9,8*LOOPNUM		; shift src2
		add		rdx,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		SubStr2_x64_2		; loop next step

; ----- prepare registers to small loop

SubStr2_x64_4:
		mov		rcx,[rsp+28h]		; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		SubStr2_x64_8		; no small loop

; ----- small loop

SubStr2_x64_6:
		rcr		rax,1				; CY <- carry
		mov		rax,[r8]			; RAX <- [src1]
		sbb		rax,[r9]			; RAX,CY <- [src1] - [src2] - CY
		mov		[rdx],rax			; save result
		rcl		rax,1				; RAX bit 0 <- carry
		add		r8,8				; shift src1
		add		r9,8				; shift src2
		add		rdx,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		SubStr2_x64_6		; loop next step

; ----- carry

SubStr2_x64_8:
		and		rax,1				; RAX <- carry
		ret

; =============================================================================
;                         INV SUB data string (1 string)
; =============================================================================
; C prototype (Win64): u64 InvSubStr_x64(u64 carry, u64* dst, const u64* src, s64 len);
; Computes dst = src - dst - (carry & 1) over len words (word 0 is the lowest word).
; inputs: rcx=u64 carry, rdx=u64* dst, r8=u64* src, r9=s64 len
; output: rax=u64 carry (borrow, 0 or 1; input carry bit 0 is returned when len <= 0)
; destroys: rcx, rdx, r8

global InvSubStr_x64

; ----- prepare registers to big loop
; RAX = temporary and carry in bit 0, RDX = dst, R8 = src, RCX = loop counter, R9 = old len

InvSubStr_x64:
		mov		rax,rcx				; RAX <- carry
		test	r9,r9				; check len
		jle		InvSubStr_x64_8		; no data
		mov		rcx,r9				; RCX <- len
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		InvSubStr_x64_4		; no big loop

; ----- big loop (borrow lives in CF inside the block, MOV does not touch it)

InvSubStr_x64_2:
		rcr		rax,1				; CY <- carry
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r8+LOOPINX]	; RAX <- [src]
		sbb		rax,[rdx+LOOPINX]	; sub dst from src
		mov		[rdx+LOOPINX],rax	; save result
%assign	LOOPINX	LOOPINX+8
%endrep
		rcl		rax,1				; RAX bit 0 <- carry (saved, ADD below destroys CF)
		add		r8,8*LOOPNUM		; shift src
		add		rdx,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		InvSubStr_x64_2		; loop next step

; ----- prepare registers to small loop

InvSubStr_x64_4:
		mov		rcx,r9				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		InvSubStr_x64_8		; no small loop

; ----- small loop

InvSubStr_x64_6:
		rcr		rax,1				; CY <- carry
		mov		rax,[r8]			; RAX <- [src]
		sbb		rax,[rdx]			; sub dst from src
		mov		[rdx],rax			; save result
		rcl		rax,1				; RAX bit 0 <- carry
		add		r8,8				; shift src
		add		rdx,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		InvSubStr_x64_6		; loop next step

; ----- carry

InvSubStr_x64_8:
		and		rax,1				; RAX <- carry
		ret


; =============================================================================
;                         MUL data string by word
; =============================================================================
; C prototype (Win64): u64 MulStr_x64(u64 carry, u64 num, u64* dst, const u64* src, s64 len);
; Computes dst = src * num + carry over len words (word 0 is the lowest word).
; inputs: rcx=u64 carry, rdx=u64 num, r8=u64* dst, r9=u64* src, [rsp+28h]=s64 len
; output: rax=u64 carry (highest word of the result; input carry when len <= 0)
; destroys: rcx, rdx, r8, r9, r10, r11

global MulStr_x64

; ----- prepare registers to big loop
; RDX, RAX = temporary, R10 = carry, R11 = num, R8 = dst, R9 = src, RCX = loop counter, [rsp+28h] = old len

MulStr_x64:
		mov		r10,rcx				; R10 <- carry
		mov		r11,rdx				; R11 <- num (RDX is overwritten by MUL)
		mov		rcx,[rsp+28h]		; RCX <- len (5th parameter, on the stack)
		test	rcx,rcx				; check len
		jle		MulStr_x64_8		; no data
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		MulStr_x64_4		; no big loop

; ----- big loop

MulStr_x64_2:
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r9+LOOPINX]	; RAX <- [src]
		mul		r11					; RDX:RAX <- [src] * num
		add		rax,r10				; add carry
		adc		rdx,0				; carry
		mov		[r8+LOOPINX],rax	; save result
		mov		r10,rdx				; R10 <- new carry
%assign	LOOPINX	LOOPINX+8
%endrep
		add		r8,8*LOOPNUM		; shift dst
		add		r9,8*LOOPNUM		; shift src
		dec		rcx					; loop counter
		jnz		MulStr_x64_2		; loop next step

; ----- prepare registers to small loop

MulStr_x64_4:
		mov		rcx,[rsp+28h]		; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		MulStr_x64_8		; no small loop

; ----- small loop

MulStr_x64_6:
		mov		rax,[r9]			; RAX <- [src]
		mul		r11					; RDX:RAX <- [src] * num
		add		rax,r10				; add carry
		adc		rdx,0				; carry
		mov		[r8],rax			; save result
		mov		r10,rdx				; R10 <- new carry
		add		r8,8				; shift dst
		add		r9,8				; shift src
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		MulStr_x64_6		; loop next step

; ----- carry

MulStr_x64_8:
		mov		rax,r10				; RAX <- carry
		ret

; =============================================================================
;                         MUL ADD data string by word
; =============================================================================
; C prototype (Win64): u64 MulAddStr_x64(u64 carry, u64 num, u64* dst, const u64* src, s64 len);
; Computes dst += src * num + carry over len words (word 0 is the lowest word).
; inputs: rcx=u64 carry, rdx=u64 num, r8=u64* dst, r9=u64* src, [rsp+28h]=s64 len
; output: rax=u64 carry (highest word of the result; input carry when len <= 0)
; destroys: rcx, rdx, r8, r9, r10, r11

global MulAddStr_x64

; ----- prepare registers to big loop
; RDX, RAX = temporary, R10 = carry, R11 = num, R8 = dst, R9 = src, RCX = loop counter, [rsp+28h] = old len

MulAddStr_x64:
		mov		r10,rcx				; R10 <- carry
		mov		r11,rdx				; R11 <- num (RDX is overwritten by MUL)
		mov		rcx,[rsp+28h]		; RCX <- len (5th parameter, on the stack)
		test	rcx,rcx				; check len
		jle		MulAddStr_x64_8		; no data
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		MulAddStr_x64_4		; no big loop

; ----- big loop

MulAddStr_x64_2:
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r9+LOOPINX]	; RAX <- [src]
		mul		r11					; RDX:RAX <- [src] * num
		add		rax,r10				; add carry
		adc		rdx,0				; carry
		add		[r8+LOOPINX],rax	; add [dst]
		adc		rdx,0				; carry
		mov		r10,rdx				; R10 <- new carry
%assign	LOOPINX	LOOPINX+8
%endrep
		add		r8,8*LOOPNUM		; shift dst
		add		r9,8*LOOPNUM		; shift src
		dec		rcx					; loop counter
		jnz		MulAddStr_x64_2		; loop next step

; ----- prepare registers to small loop

MulAddStr_x64_4:
		mov		rcx,[rsp+28h]		; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		MulAddStr_x64_8		; no small loop

; ----- small loop

MulAddStr_x64_6:
		mov		rax,[r9]			; RAX <- [src]
		mul		r11					; RDX:RAX <- [src] * num
		add		rax,r10				; add carry
		adc		rdx,0				; carry
		add		[r8],rax			; add [dst]
		adc		rdx,0				; carry
		mov		r10,rdx				; R10 <- new carry
		add		r8,8				; shift dst
		add		r9,8				; shift src
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		MulAddStr_x64_6		; loop next step

; ----- carry

MulAddStr_x64_8:
		mov		rax,r10				; RAX <- carry
		ret

; =============================================================================
;                    Add square of entries of data string
; =============================================================================
; C prototype (Win64): void AddSqrStr_x64(u64* dst, const u64* src, s64 len);
; For every i in 0..len-1 adds the 128-bit square src[i]*src[i] to the word
; pair dst[2*i], dst[2*i+1]; the carry of each pair is passed on to the next one.
; dst therefore covers 2*len words. The carry out of the last pair is dropped.
; inputs: rcx=u64* dst, rdx=u64* src, r8=s64 len
; destroys: rax, rcx, rdx, r9, r10, r11

global AddSqrStr_x64

; ----- prepare registers to big loop
; RDX, RAX = temporary, R11 = carry, R9 = dst, R10 = src, RCX = loop counter, R8 = old len

AddSqrStr_x64:
		xor		r11,r11				; R11 <- 0, carry
		mov		r9,rcx				; R9 <- dst
		mov		r10,rdx				; R10 <- src (RDX is overwritten by MUL)
		mov		rcx,r8				; RCX <- len
		test	rcx,rcx				; check len
		jle		AddSqrStr_x64_8		; no data
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		AddSqrStr_x64_4		; no big loop

; ----- big loop (dst advances twice as fast as src)

AddSqrStr_x64_2:
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r10+LOOPINX]	; RAX <- [src]
		mul		rax					; RDX:RAX <- [src] * [src]
		add		rax,r11				; add carry
		adc		rdx,0				; carry HIGH
		add		rax,[r9+2*LOOPINX]	; add [dst]
		adc		rdx,0				; carry HIGH
		mov		[r9+2*LOOPINX],rax	; save result LOW
		xor		r11,r11				; R11 <- 0, no carry
		add		rdx,[r9+2*LOOPINX+8] ; add [dst+1]
		adc		r11,r11				; R11 <- carry
		mov		[r9+2*LOOPINX+8],rdx ; save result HIGH
%assign	LOOPINX	LOOPINX+8
%endrep
		add		r9,2*8*LOOPNUM		; shift dst
		add		r10,8*LOOPNUM		; shift src
		dec		rcx					; loop counter
		jnz		AddSqrStr_x64_2		; loop next step

; ----- prepare registers to small loop

AddSqrStr_x64_4:
		mov		rcx,r8				; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		AddSqrStr_x64_8		; no small loop

; ----- small loop

AddSqrStr_x64_6:
		mov		rax,[r10]			; RAX <- [src]
		mul		rax					; RDX:RAX <- [src] * [src]
		add		rax,r11				; add carry
		adc		rdx,0				; carry HIGH
		add		rax,[r9]			; add [dst]
		adc		rdx,0				; carry HIGH
		mov		[r9],rax			; save result LOW
		xor		r11,r11				; R11 <- 0, no carry
		add		rdx,[r9+8]			; add [dst+1]
		adc		r11,r11				; R11 <- carry
		mov		[r9+8],rdx			; save result HIGH
		add		r9,2*8				; shift dst
		add		r10,8				; shift src
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		AddSqrStr_x64_6		; loop next step

AddSqrStr_x64_8:
		ret

; =============================================================================
;                         DIV data string by word
; =============================================================================
; C prototype (Win64): u64 DivStr_x64(u64 carry, u64 num, u64* dst, const u64* src, s64 len);
; Divides the len-word number src (word 0 is the lowest word) by num, going
; from the highest word down to the lowest, and stores the quotient to dst.
; "carry" is the remainder coming from above the highest word.
; DIV faults if num == 0 or carry >= num (quotient does not fit into 64 bits).
; inputs: rcx=u64 carry, rdx=u64 num, r8=u64* dst, r9=u64* src, [rsp+28h]=s64 len
; output: rax=u64 carry (remainder; input carry when len <= 0)
; destroys: rcx, rdx, r8, r9, r11

global DivStr_x64

; ----- prepare registers to big loop
; RAX = temporary, RDX = carry, R11 = num, R8 = dst, R9 = src, RCX = loop counter, [rsp+28h] = old len

DivStr_x64:
		mov		r11,rdx				; R11 <- num (divisor)
		mov		rdx,rcx				; RDX <- carry (high half of the dividend)
		mov		rcx,[rsp+28h]		; RCX <- len (5th parameter, on the stack)
		test	rcx,rcx				; check len
		jle		DivStr_x64_8		; no data
		lea		r8,[r8+rcx*8-8]		; R8 <- dst last entry
		lea		r9,[r9+rcx*8-8]		; R9 <- src last entry
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		DivStr_x64_4		; no big loop

; ----- big loop (walks towards lower addresses)

DivStr_x64_2:
%assign LOOPINX 0
%rep	LOOPNUM
		mov		rax,[r9-LOOPINX]	; RAX <- [src]
		div		r11					; RAX quotient, RDX remainder <- RDX:[src] / num
		mov		[r8-LOOPINX],rax	; save result
%assign	LOOPINX	LOOPINX+8
%endrep
		sub		r8,8*LOOPNUM		; shift dst
		sub		r9,8*LOOPNUM		; shift src
		dec		rcx					; loop counter
		jnz		DivStr_x64_2		; loop next step

; ----- prepare registers to small loop

DivStr_x64_4:
		mov		rcx,[rsp+28h]		; RCX <- len
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		DivStr_x64_8		; no small loop

; ----- small loop

DivStr_x64_6:
		mov		rax,[r9]			; RAX <- [src]
		div		r11					; RAX quotient, RDX remainder <- RDX:[src] / num
		mov		[r8],rax			; save result
		sub		r8,8				; shift dst
		sub		r9,8				; shift src
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		DivStr_x64_6		; loop next step

; ----- carry

DivStr_x64_8:
		mov		rax,rdx				; RAX <- carry
		ret

; =============================================================================
;                   Expand number division string by word
; =============================================================================
; C prototype (Win64): s64 DivUExpStr_x64(u64 carry, u64 num, u64* dst, s64 maxlen);
; Repeatedly divides (carry:0) by num and takes the remainder as the next
; carry. Quotient words are stored downwards below dst: the first one to
; dst[-1], the next one to dst[-2], and so on. Stops when the remainder
; becomes zero or when maxlen words were stored.
; DIV faults if num == 0 or carry >= num (quotient does not fit into 64 bits).
; inputs: rcx=u64 carry, rdx=u64 num, r8=u64* dst, r9=s64 maxlen
; output: rax=s64 length of expansion (number of stored words, 0 when maxlen <= 0)
; destroys: rcx, rdx, r8, r10, r11

global DivUExpStr_x64

; ----- prepare registers to big loop
; RAX = temporary, RDX = carry, R11 = num, R8 = dst, R9 = maxlen, R10 = len, RCX = loop counter

DivUExpStr_x64:
		mov		r11,rdx				; R11 <- num (divisor)
		xor		r10,r10				; R10 <- 0, len counter
		mov		rdx,rcx				; RDX <- carry (high half of the dividend)
		mov		rcx,r9				; RCX <- maxlen
		test	rcx,rcx				; check maxlen
		jle		DivUExpStr_x64_8	; no data
		shr		rcx,LOOPBIT			; RCX <- number of big loops
		jz		DivUExpStr_x64_4	; no big loop

; ----- big loop (offset is increased first, so stores start at [R8-8])

DivUExpStr_x64_2:
%assign LOOPINX 0
%rep	LOOPNUM
%assign	LOOPINX	LOOPINX+8
		or		rdx,rdx				; any remainder?
		jz		DivUExpStr_x64_8	; end of expansion
		xor		rax,rax				; RAX <- 0
		div		r11					; RAX quotient, RDX remainder <- RDX:0 / num
		mov		[r8-LOOPINX],rax	; save result
		inc		r10					; increase len counter
%endrep
		sub		r8,8*LOOPNUM		; shift dst
		dec		rcx					; loop counter
		jnz		DivUExpStr_x64_2	; loop next step

; ----- prepare registers to small loop

DivUExpStr_x64_4:
		mov		rcx,r9				; RCX <- maxlen
		and		rcx,LOOPMASK		; RCX <- number of small loops
		jz		DivUExpStr_x64_8	; no small loop

; ----- small loop

DivUExpStr_x64_6:
		or		rdx,rdx				; any remainder?
		jz		DivUExpStr_x64_8	; end of expansion
		xor		rax,rax				; RAX <- 0
		div		r11					; RAX quotient, RDX remainder <- RDX:0 / num
		mov		[r8-8],rax			; save result
		inc		r10					; increase len counter
		sub		r8,8				; shift dst
		dec		rcx					; loop counter ("dec"+"jnz" instead of the slow "loop")
		jnz		DivUExpStr_x64_6	; loop next step

; ----- len

DivUExpStr_x64_8:
		mov		rax,r10				; RAX <- len
		ret

; =============================================================================
;                       Check instruction timings
; =============================================================================
; Timing scaffold: the outer loop runs 1000000 times and each pass executes
; the instruction placed inside the %rep block 1000 times (currently
; "adc rax,0"). Presumably the figures in the timing table below this
; function were measured by swapping that instruction - TODO confirm.
; inputs: rcx=u64* dst, rdx=u64* src
; output: none (RAX only holds whatever the measured instructions leave there;
;         it is not initialized before the first "adc")
; RSI and RDI are saved and restored; RAX, RCX, R9, R10 are destroyed

global Test_x64

Test_x64:
		push	rsi					; save RSI (must be preserved in Win64)
		push	rdi					; save RDI (must be preserved in Win64)
		mov		rdi,rcx				; RDI <- dst
		mov		rsi,rdx				; RSI <- src
		mov		r9,rdi				; R9 <- dst backup, reloaded on every pass
		mov		r10,rsi				; R10 <- src backup, reloaded on every pass
				
		mov		rcx,1000000			; RCX <- number of outer passes



Test_x64_1:

		; disabled experiment: copy through RAX with byte-wise pointer steps
		;mov		rax,[rsi]
		;inc		rsi
		;mov		[rdi],rax
		;inc		rdi
		
		mov		rdi,r9				; RDI <- dst, restart from the beginning
		mov		rsi,r10				; RSI <- src, restart from the beginning
		push	rcx					; save outer counter, RCX is reused as an operand

		mov		rcx,1756567576567	; RCX <- operand value for measured instructions (not used by "adc rax,0")
%rep 1000
		adc		rax,0				; measured instruction
%endrep

		pop		rcx					; restore outer counter
		dec		rcx					; next pass
		jnz		Test_x64_1
		
		
		
		pop		rdi					; restore RDI
		pop		rsi					; restore RSI
		ret
		
		
; Timings Intel Core 2 Duo 3 GHz:
;	push+pop ... 1.7 ns
;	nop ... 0.11 ns
;	xor		rdx,rdx ... 0.11 ns
;	mov		rdx,75644334897678 ... 0.22 ns
;	mov		cl,23 ... 0.11 ns
;	mov		rax,rdx ... 0.11 ns
;	xchg	rax,rdx ... 0.66 ns
;	shld	rdx,rax,cl ... 0.22 ns
;	add		rax,rcx ... 0.33 ns
;	adc		rax,rcx ... 0.66 ns
;	adc		rax,0 ... 0.66 ns
;	sub		rax,rcx ... 0.33 ns
;	sbb		rax,rcx ... 0.66 ns
;	and		rax,rcx ... 0.33 ns
;	mul		rcx ... 1.3 ns
;	mul		ecx ... 0.44 ns
;	mul		cx ... 0.44 ns
;	mul		cl ... 0.11 ns
;	div		rcx ... 18 ns
;	div		ecx ... 4.5 ns
;	div		cx ... 3.5 ns
;	div		cl ... 3 ns
;	inc		rcx ... 0.33 ns
;	mov		rax,[rsi] ... 0.33 ns
;	mov		rax,[rsi+8] ... 0.33 ns
;	mov		[rdi],rax ... 0.33 ns
;	lodsq ... 0.66 ns (mov rax + inc ... 0.66 ns)
;	rep lodsq ... 1.0 ns (per one step)
;	stosq ... 0.33 ns (mov rax + inc ... 0.77 ns)
;	rep stosw ... 0.33 ns (per one step)
;	loop,rcx ... 1.7 ns
;	jnz	... 0.1 ns
;	dec + jnz ... 0.33 ns
;	movsq ... 1.4 ns (mov rax + inc + mov rax + inc ... 1.4 ns)
;	rep movsq ... 0.4 ns (per one step) (mov rax + jnz ... 1.6 ns)



